# Global knitr chunk options for this document.
knitr::opts_chunk$set(
  echo = TRUE,
  error = FALSE,
  comment = "#>",
  fig.path = "img/",
  fig.retina = 2,
  fig.width = 10,
  # When fig.asp is set, knitr computes fig.height = fig.width * fig.asp,
  # so an explicit fig.height is silently ignored. The previous
  # `fig.height = 20` was therefore dead and has been removed.
  fig.asp = 3 / 4,
  fig.pos = "t",
  fig.align = "center",
  dpi = 150,
  out.width = "90%",
  dev.args = list(png = list(type = "cairo-png")),
  # optipng options only take effect if the optipng hook is enabled,
  # e.g. knitr::knit_hooks$set(optipng = knitr::hook_optipng) — TODO confirm.
  optipng = "-o1 -quiet"
)

1 cluster+rank+predict

1.0.0.1 Loading Required Packages

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(scorecard)
#> 
#> Attaching package: 'scorecard'
#> 
#> The following object is masked from 'package:tidyr':
#> 
#>     replace_na
library(glmnet)
#> Loading required package: Matrix
#> 
#> Attaching package: 'Matrix'
#> 
#> The following objects are masked from 'package:tidyr':
#> 
#>     expand, pack, unpack
#> 
#> Loaded glmnet 4.1-8
library(ggplot2)
library(plotly)
#> 
#> Attaching package: 'plotly'
#> 
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> 
#> The following object is masked from 'package:graphics':
#> 
#>     layout
library(dplyr)
library(xefun)
library(modeest)
#> Registered S3 method overwritten by 'rmutil':
#>   method         from
#>   print.response httr
library(cluster)
library(GA)
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> 
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> 
#> Loading required package: iterators
#> Package 'GA' version 3.2.4
#> Type 'citation("GA")' for citing this R package in publications.
#> 
#> Attaching package: 'GA'
#> 
#> The following object is masked from 'package:utils':
#> 
#>     de
library(dendextend)
#> 
#> ---------------------
#> Welcome to dendextend version 1.17.1
#> Type citation('dendextend') for how to cite the package.
#> 
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#> 
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags: 
#>   https://stackoverflow.com/questions/tagged/dendextend
#> 
#>  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#> 
#> 
#> Attaching package: 'dendextend'
#> 
#> The following object is masked from 'package:stats':
#> 
#>     cutree
library(parallel)
library(ROCR)
library(gridExtra)
#> 
#> Attaching package: 'gridExtra'
#> 
#> The following object is masked from 'package:dplyr':
#> 
#>     combine
library(grid)
library(writexl)
library(openxlsx)
library(clusterSim)
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> 
#> The following object is masked from 'package:plotly':
#> 
#>     select
#> 
#> The following object is masked from 'package:dplyr':
#> 
#>     select
library(ROCR)
library(verification)
#> Loading required package: fields
#> Loading required package: spam
#> Spam version 2.9-1 (2022-08-07) is loaded.
#> Type 'help( Spam)' or 'demo( spam)' for a short introduction 
#> and overview of this package.
#> Help for individual functions is also obtained by adding the
#> suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
#> 
#> Attaching package: 'spam'
#> 
#> The following object is masked from 'package:Matrix':
#> 
#>     det
#> 
#> The following objects are masked from 'package:base':
#> 
#>     backsolve, forwardsolve
#> 
#> Loading required package: viridisLite
#> 
#> Try help(fields) to get started.
#> 
#> Attaching package: 'fields'
#> 
#> The following object is masked from 'package:xefun':
#> 
#>     ceiling2
#> 
#> The following object is masked from 'package:scorecard':
#> 
#>     describe
#> 
#> Loading required package: boot
#> Loading required package: CircStats
#> Loading required package: dtw
#> Loading required package: proxy
#> 
#> Attaching package: 'proxy'
#> 
#> The following object is masked from 'package:spam':
#> 
#>     as.matrix
#> 
#> The following object is masked from 'package:Matrix':
#> 
#>     as.matrix
#> 
#> The following objects are masked from 'package:stats':
#> 
#>     as.dist, dist
#> 
#> The following object is masked from 'package:base':
#> 
#>     as.matrix
#> 
#> Loaded dtw v1.23-1. See ?dtw for help, citation("dtw") for use in publication.
library(pROC)
#> Registered S3 method overwritten by 'pROC':
#>   method    from        
#>   lines.roc verification
#> Type 'citation("pROC")' for a citation.
#> 
#> Attaching package: 'pROC'
#> 
#> The following object is masked from 'package:verification':
#> 
#>     lines.roc
#> 
#> The following objects are masked from 'package:stats':
#> 
#>     cov, smooth, var
library(xgboost)
#> 
#> Attaching package: 'xgboost'
#> 
#> The following object is masked from 'package:plotly':
#> 
#>     slice
#> 
#> The following object is masked from 'package:dplyr':
#> 
#>     slice
library(Matrix)
library(dbscan)
#> 
#> Attaching package: 'dbscan'
#> 
#> The following object is masked from 'package:stats':
#> 
#>     as.dendrogram
library(knitr)

1.1 part1_Import Data

# Load the loan-level training data from a local CSV.
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or a project-rooted path.
train_data <- read.csv("C:/Users/lenovo/Downloads/train.csv")

# Inspect the structure (column names, types, first values).
str(train_data)
#> 'data.frame':    81738 obs. of  21 variables:
#>  $ loan_amnt            : int  5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#>  $ funded_amnt          : int  5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#>  $ pymnt_plan           : chr  "n" "n" "n" "n" ...
#>  $ grade                : chr  "B" "C" "A" "E" ...
#>  $ sub_grade_num        : num  0.4 1 0.8 0.2 0.4 1 0.6 0.2 0.2 0.4 ...
#>  $ short_emp            : int  0 0 0 0 0 1 0 1 0 0 ...
#>  $ emp_length_num       : int  11 11 4 10 5 1 6 1 4 4 ...
#>  $ home_ownership       : chr  "RENT" "RENT" "RENT" "RENT" ...
#>  $ dti                  : num  27.65 8.72 11.2 5.35 5.55 ...
#>  $ purpose              : chr  "credit_card" "small_business" "wedding" "car" ...
#>  $ payment_inc_ratio    : num  8.14 8.26 5.22 2.74 4.57 ...
#>  $ delinq_2yrs          : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ delinq_2yrs_zero     : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ inq_last_6mths       : int  1 2 3 2 2 0 2 1 2 2 ...
#>  $ last_delinq_none     : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ last_major_derog_none: int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ open_acc             : int  3 2 9 4 11 2 14 4 11 14 ...
#>  $ pub_rec              : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ pub_rec_zero         : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ revol_util           : num  83.7 98.5 28.3 87.5 32.6 36.5 20.6 91.7 43.1 55.5 ...
#>  $ bad_loans            : int  0 0 0 0 1 1 0 1 0 1 ...
# Frequency of each loan purpose category.
purpose_counts <- table(train_data$purpose)
print(purpose_counts)
#> 
#>                car        credit_card debt_consolidation   home_improvement 
#>               1570              14722              45428               4990 
#>              house     major_purchase            medical             moving 
#>                665               2580               1085                792 
#>              other     small_business           vacation            wedding 
#>               6107               2173                587               1039
# Frequency of each home-ownership status.
home_ownership_counts <- table(train_data$home_ownership)
print(home_ownership_counts)
#> 
#> MORTGAGE    OTHER      OWN     RENT 
#>    39583      116     6639    35400
# Frequency of each loan grade (A through G).
grade_counts <- table(train_data$grade)
print(grade_counts)
#> 
#>     A     B     C     D     E     F     G 
#> 14812 24775 19928 12847  6022  2611   743
# Total number of missing values across the whole data frame.
# NOTE(review): the original comment mentioned duplicates, but no
# duplicate check is actually performed here.
sum(is.na(train_data))
#> [1] 97
# Encode categorical columns as integer codes via factor levels.
# BUG FIX: `levels = ...` was previously passed to as.numeric() — where it
# is silently ignored — instead of factor(). The explicit level order now
# actually takes effect. For these three columns the intended order
# happens to coincide with factor()'s default alphabetical order, so the
# resulting codes are unchanged.
train_data <- train_data %>%
  mutate(
    grade = as.numeric(factor(grade, levels = c("A", "B", "C", "D", "E", "F", "G"))),
    purpose = as.numeric(factor(purpose, levels = c("car", "credit_card", "debt_consolidation", "home_improvement", "house", "major_purchase", "medical", "moving", "other", "small_business", "vacation", "wedding"))),
    home_ownership = as.numeric(factor(home_ownership, levels = c("MORTGAGE", "OTHER", "OWN", "RENT")))
  )

# Drop pymnt_plan. dplyr::select() is qualified explicitly because MASS
# (loaded via clusterSim) masks dplyr's select().
train_data <- dplyr::select(train_data, -pymnt_plan)

# Count missing values per column. vapply() replaces sapply() so the
# result is guaranteed to be a named integer vector regardless of input.
na_counts <- vapply(train_data, function(x) sum(is.na(x)), integer(1L))
print(na_counts)
#>             loan_amnt           funded_amnt                 grade 
#>                     0                     0                     0 
#>         sub_grade_num             short_emp        emp_length_num 
#>                     0                     0                     0 
#>        home_ownership                   dti               purpose 
#>                     0                     0                     0 
#>     payment_inc_ratio           delinq_2yrs      delinq_2yrs_zero 
#>                     1                    16                    16 
#>        inq_last_6mths      last_delinq_none last_major_derog_none 
#>                    16                     0                     0 
#>              open_acc               pub_rec          pub_rec_zero 
#>                    16                    16                    16 
#>            revol_util             bad_loans 
#>                     0                     0
# Draw a reproducible random half of the data to keep the later binning
# and clustering steps tractable.
set.seed(1)
total_rows <- nrow(train_data)
# NOTE(review): total_rows / 2 is fractional when the row count is odd;
# confirm sample_n() handles a non-integer size as intended.
subset_size <- total_rows / 2
random_subset <- train_data %>%
  sample_n(subset_size)
cat("Total rows:", total_rows, "\n")
#> Total rows: 81738
cat("Subset size:", subset_size, "\n")
#> Subset size: 40869
print(head(random_subset))
#>   loan_amnt funded_amnt grade sub_grade_num short_emp emp_length_num
#> 1     12000       12000     3           1.0         0              4
#> 2     18550       18550     4           0.4         0              5
#> 3     10000       10000     4           0.6         0              5
#> 4      9600        9600     1           0.4         1              0
#> 5     12000       12000     1           0.4         0             11
#> 6      6000        6000     5           0.4         0              6
#>   home_ownership   dti purpose payment_inc_ratio delinq_2yrs delinq_2yrs_zero
#> 1              4  9.14       3          11.76200           0                1
#> 2              4 18.64       7           8.77438           0                1
#> 3              1 10.87       2           6.54269           0                1
#> 4              4 19.14       3           5.89520           1                0
#> 5              1  4.20      10           6.23880           0                1
#> 6              4 19.92       3           4.27303           0                1
#>   inq_last_6mths last_delinq_none last_major_derog_none open_acc pub_rec
#> 1              5                0                     1       15       1
#> 2              1                1                     1        4       0
#> 3              3                1                     1        8       0
#> 4              1                0                     1       13       0
#> 5              0                1                     1       14       0
#> 6              6                0                     1       16       0
#>   pub_rec_zero revol_util bad_loans
#> 1            0       63.5         0
#> 2            1       47.1         0
#> 3            1       50.8         0
#> 4            1       23.7         0
#> 5            1        6.0         0
#> 6            1       45.1         0
# The random half-sample becomes the working training set for the
# IV / WOE / clustering steps below.
train_data1 <- random_subset

1.2 part2_Iv

# Compute the Information Value (IV) of every predictor against the
# target bad_loans, round to 3 decimals, and sort from most to least
# predictive. NOTE: the result is bound to `iv`, which shadows the
# scorecard::iv() function for the rest of the session.
iv <- iv(train_data1, y = "bad_loans") %>%
  as_tibble() %>%
  mutate(info_value = round(info_value, 3)) %>%
  arrange(desc(info_value))

# Render the IV ranking as a markdown table.
iv %>%
  knitr::kable()
variable info_value
dti 0.485
grade 0.329
revol_util 0.252
loan_amnt 0.236
funded_amnt 0.234
payment_inc_ratio 0.081
purpose 0.042
inq_last_6mths 0.041
home_ownership 0.017
emp_length_num 0.010
open_acc 0.008
short_emp 0.004
delinq_2yrs 0.003
pub_rec 0.001
delinq_2yrs_zero 0.001
sub_grade_num 0.001
last_delinq_none 0.000
last_major_derog_none 0.000
pub_rec_zero 0.000

1.3 part3_WOE

# Fit optimal WOE bins for each predictor against the target.
bins <- woebin(train_data1, y = 'bad_loans')
#> ℹ Creating woe binning ...
#> ✔ Binning on 40869 rows and 20 columns in 00:00:12
# Names of the binned variables. The original `names(bins[])` made a
# pointless full copy via `[]`; names(bins) is equivalent.
# NOTE(review): `variables` is not referenced again in this document.
variables <- names(bins)
# Replace raw values with their WOE-transformed counterparts.
train_woe <- woebin_ply(train_data1, bins)
#> ℹ Converting into woe values ...
#> ✔ Woe transformating on 40869 rows and 19 columns in 00:00:11

1.4 part4_feature_selection

1.4.1 PCA

# Work on the WOE-transformed training set from here on.
df_train <- train_woe

# Drop the target column before unsupervised analysis.
df1 <- dplyr::select(df_train, -bad_loans)

# PCA with centering but no scaling. `scale. = FALSE` is spelled out
# instead of the unsafe shorthand `F` (which is reassignable).
# NOTE(review): presumably scaling is skipped because WOE features share
# a comparable log-odds scale — confirm this is intentional.
data_pca <- prcomp(df1, center = TRUE, scale. = FALSE)
summary(data_pca)
#> Importance of components:
#>                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
#> Standard deviation     0.6260 0.3721 0.2820 0.22881 0.20052 0.17533 0.15299
#> Proportion of Variance 0.5004 0.1768 0.1016 0.06685 0.05135 0.03926 0.02989
#> Cumulative Proportion  0.5004 0.6772 0.7788 0.84562 0.89696 0.93622 0.96610
#>                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
#> Standard deviation     0.12092 0.08631 0.04382 0.03642 0.02566 0.01404 0.01132
#> Proportion of Variance 0.01867 0.00951 0.00245 0.00169 0.00084 0.00025 0.00016
#> Cumulative Proportion  0.98478 0.99429 0.99674 0.99843 0.99927 0.99953 0.99969
#>                            PC15     PC16     PC17      PC18      PC19
#> Standard deviation     0.009821 0.009364 0.007662 0.0006635 2.419e-16
#> Proportion of Variance 0.000120 0.000110 0.000070 0.0000000 0.000e+00
#> Cumulative Proportion  0.999810 0.999920 1.000000 1.0000000 1.000e+00
# Loadings: how each WOE feature contributes to each principal component.
loadings <- data_pca$rotation
head(loadings)
#>                             PC1          PC2          PC3          PC4
#> loan_amnt_woe       0.073925136 -0.253820895  0.063586182 -0.198899745
#> funded_amnt_woe     0.073912131 -0.251889280  0.062201727 -0.195292233
#> grade_woe           0.945146341  0.221287928  0.199161724  0.035117355
#> sub_grade_num_woe  -0.005563003 -0.005599729 -0.009461251 -0.002620156
#> short_emp_woe      -0.002213074 -0.001496193  0.005141298  0.004143282
#> emp_length_num_woe -0.002408340 -0.001046150  0.006117357  0.004535543
#>                             PC5          PC6          PC7         PC8
#> loan_amnt_woe      -0.596521308  0.083046755 -0.070652246  0.16337237
#> funded_amnt_woe    -0.585355665  0.082238273 -0.068350612  0.16071107
#> grade_woe           0.032423462  0.105600793  0.057545178 -0.03161873
#> sub_grade_num_woe  -0.004397023 -0.013970272 -0.011040741  0.01266304
#> short_emp_woe       0.044973686 -0.008747212 -0.003109089  0.06880736
#> emp_length_num_woe  0.049823527 -0.009151991 -0.003684869  0.07775236
#>                             PC9          PC10         PC11          PC12
#> loan_amnt_woe      -0.024335682  0.0044949026 -0.003463462 -0.0037211962
#> funded_amnt_woe    -0.024055461  0.0030525030  0.001655281  0.0140260871
#> grade_woe          -0.004505497 -0.0008711838 -0.011588331 -0.0116795853
#> sub_grade_num_woe   0.001316874  0.0092330898  0.056617537 -0.9974973244
#> short_emp_woe      -0.693738384  0.0058779791  0.001623484  0.0007030922
#> emp_length_num_woe -0.709286624  0.0054178875  0.003796025 -0.0005814363
#>                             PC13         PC14          PC15         PC16
#> loan_amnt_woe       0.6992147436 -0.049363944 -0.0101610460  0.002800952
#> funded_amnt_woe    -0.7113296330  0.047644996  0.0111328205  0.001980115
#> grade_woe           0.0006441983  0.004242974  0.0001074442 -0.002173667
#> sub_grade_num_woe  -0.0106377005  0.025865070  0.0001360339 -0.014421276
#> short_emp_woe      -0.0152838291 -0.074032288 -0.6886503038 -0.178334445
#> emp_length_num_woe  0.0148283926  0.067794163  0.6727614217  0.175270894
#>                            PC17          PC18          PC19
#> loan_amnt_woe      -0.001259353  4.512122e-05  2.014199e-15
#> funded_amnt_woe    -0.001106719 -4.082085e-05 -7.996344e-16
#> grade_woe           0.001585032  8.722092e-06  9.584531e-17
#> sub_grade_num_woe   0.007808105  1.586855e-05  3.545324e-16
#> short_emp_woe      -0.006202354 -3.354520e-04  4.690120e-16
#> emp_length_num_woe  0.005881672  6.461365e-05 -5.546858e-16
# Scree plot: percentage of variance explained by each component.
fviz_eig(data_pca, addlabels = TRUE)

# First rows of the PCA scores (observations in component space).
head(data_pca$x)
#>             PC1         PC2       PC3         PC4          PC5          PC6
#> [1,]  0.2921316 -0.19089166 0.1057117 -0.05820084  0.219797540 -0.412702689
#> [2,]  0.5134660 -0.03691616 0.2125867  0.11715191 -0.304435505  0.095277648
#> [3,]  0.5207891  0.18552810 0.1064171 -0.05423267  0.007266799 -0.377334274
#> [4,] -1.1263020  0.07704632 0.1638929  0.32977103 -0.098309680  0.007099912
#> [5,] -1.1771336  0.15374169 0.3223907 -0.14675212 -0.013589018  0.164693871
#> [6,]  0.8368147  0.58757265 0.2756430  0.36246019  0.045630945 -0.191133919
#>              PC7         PC8         PC9         PC10         PC11
#> [1,] -0.05091001  0.09821871  0.06706506  0.028341563 -0.005671512
#> [2,]  0.18346330  0.20878863  0.03659958  0.059627400 -0.022679101
#> [3,]  0.19399641 -0.10711691  0.02778540 -0.003411623 -0.020516487
#> [4,] -0.06360999  0.19311549 -0.21201255 -0.055867561  0.097186839
#> [5,] -0.31250876 -0.15023069  0.02451762  0.028734360 -0.016472905
#> [6,]  0.01093339  0.11298191  0.05369666 -0.053209611 -0.023127403
#>               PC12          PC13         PC14         PC15         PC16
#> [1,] -0.0210751094 -0.0009599073 -0.012285314 -0.008755016  0.023993526
#> [2,]  0.0045274794 -0.0013699203  0.008656917 -0.003202198 -0.002783650
#> [3,] -0.0077617866  0.0004323956  0.009952554 -0.002231300 -0.004361317
#> [4,]  0.0209509095 -0.0005126908  0.003215737 -0.003733679 -0.001526565
#> [5,]  0.0111541083 -0.0014094692  0.004725085 -0.002400555 -0.001721166
#> [6,] -0.0002443128 -0.0013473640 -0.003498016 -0.001031571 -0.009234172
#>              PC17          PC18          PC19
#> [1,] -0.017875967 -2.174590e-05  2.755393e-16
#> [2,]  0.003341090  1.209072e-05  3.543329e-16
#> [3,]  0.004376651  3.914620e-05 -5.839496e-17
#> [4,] -0.007516465 -6.584101e-05 -2.862365e-16
#> [5,]  0.004050195 -5.141151e-05 -1.828037e-16
#> [6,] -0.011053840  4.810600e-05 -1.036341e-16
# Eigenvalue and explained-variance table for every PCA dimension.
eig.val <- get_eigenvalue(data_pca)
eig.val
#>          eigenvalue variance.percent cumulative.variance.percent
#> Dim.1  3.918482e-01     5.003707e+01                    50.03707
#> Dim.2  1.384765e-01     1.768276e+01                    67.71983
#> Dim.3  7.953661e-02     1.015643e+01                    77.87626
#> Dim.4  5.235333e-02     6.685260e+00                    84.56152
#> Dim.5  4.020963e-02     5.134570e+00                    89.69609
#> Dim.6  3.074142e-02     3.925526e+00                    93.62162
#> Dim.7  2.340642e-02     2.988883e+00                    96.61050
#> Dim.8  1.462112e-02     1.867045e+00                    98.47754
#> Dim.9  7.449615e-03     9.512788e-01                    99.42882
#> Dim.10 1.919864e-03     2.451571e-01                    99.67398
#> Dim.11 1.326062e-03     1.693315e-01                    99.84331
#> Dim.12 6.586397e-04     8.410501e-02                    99.92742
#> Dim.13 1.970018e-04     2.515616e-02                    99.95257
#> Dim.14 1.281299e-04     1.636155e-02                    99.96893
#> Dim.15 9.645838e-05     1.231726e-02                    99.98125
#> Dim.16 8.767756e-05     1.119599e-02                    99.99245
#> Dim.17 5.870826e-05     7.496753e-03                    99.99994
#> Dim.18 4.402401e-07     5.621647e-05                   100.00000
#> Dim.19 5.852888e-32     7.473848e-30                   100.00000
# Final PCA kept at rank 3 (rank. = 3): the first three components are
# used as the clustering features. (The original comment about the
# Elbow method was misleading — the elbow analysis happens below.)
# NOTE(review): center = FALSE here, although the exploratory PCA above
# used center = TRUE — confirm this difference is intentional.
data_pca_final<-prcomp(df1, center=FALSE, scale.=FALSE, rank. = 3)
results <- data_pca_final$x

1.5 part5_Clustering

1.5.1 K-means on all result of pca

set.seed(2)
# Total within-cluster sum of squares for a k-means fit with k centers,
# used to build the elbow curve below.
wss1 <- function(k) {
  kmeans(results, centers = k, iter.max = 100, nstart = 50)$tot.withinss
}
k.values <- 1:10
# vapply() replaces sapply() so the result is guaranteed to be a
# numeric vector; the RNG call order is unchanged.
wss_values1 <- vapply(k.values, wss1, numeric(1))
wss_values1
#>  [1] 24923.572 14400.046 10627.189  8140.346  6908.100  5934.548  5332.015
#>  [8]  4739.680  4306.678  3945.009
# Plot the WSS values for each number of clusters (elbow plot).
plot(k.values, wss_values1, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')

# K-means with k = 3 (chosen from the elbow plot above), 100 random
# starts for a stable solution.
set.seed(1)
km_res <- kmeans(results, centers = 3, nstart = 100)
# Visualize the clusters in the first two PCA dimensions.
fviz_cluster(km_res, data = results) +
  scale_color_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
  scale_fill_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
      ggtitle("3 Cluster for result of pca") +
  theme_minimal()

# Attach the k-means cluster assignment to the PCA scores.
Customers_Segments <- data.frame(results, cluster = as.factor(km_res$cluster))

km_res$size
#> [1]  7309 19836 13724
km_res$centers
#>          PC1         PC2          PC3
#> 1 -1.1658261 -0.08222403 -0.007465017
#> 2 -0.1858829  0.09897490 -0.011609765
#> 3  0.5382256 -0.04571766  0.024009623
# Record cluster membership on the feature data.
df1$groupkm <- km_res$cluster

# Split the data by cluster.
g1 <- df1[df1$groupkm == 1, ]
g2 <- df1[df1$groupkm == 2, ]
# BUG FIX: g3 previously filtered on groupkm == 2 (copy-paste error),
# making it an exact duplicate of g2; it now selects cluster 3.
g3 <- df1[df1$groupkm == 3, ]

1.5.2 K-means+GA

set.seed(8)
# GA fitness: negated WSS, so that maximizing fitness minimizes the
# within-cluster sum of squares. k is rounded from the real-valued
# chromosome and clamped to at least 2.
# NOTE(review): WSS decreases monotonically as k grows, so this fitness
# will always favor the largest allowed k; a penalized criterion
# (silhouette, gap statistic) would make the search meaningful.
objective_function <- function(params) {
  k <- round(params)
  if (k < 2) k <- 2 
  return(-calculate_wss(k, results)) 
}

# Total within-cluster sum of squares for k-means with k centers.
# NOTE(review): nstart = 100 makes every single fitness evaluation run
# 100 k-means fits — expensive inside a GA loop.
calculate_wss <- function(k, results) {
  kmeans_result <- kmeans(results, centers = k, nstart = 100)
  return(kmeans_result$tot.withinss)
}

# Real-valued GA search over k in [1, 10]; terminates after 10
# consecutive generations without improvement (run = 10).
# NOTE(review): parallel = TRUE can break reproducibility even after
# set.seed() — confirm this is acceptable.
ga_result <- ga(
  type = "real-valued",
  fitness = objective_function,
  lower = 1,  
  upper = 10,    
  run = 10,
  parallel = TRUE,
  monitor = if(interactive()) gaMonitor else FALSE
)

# Best k found by the GA (rounded from the real-valued solution).
best_k <- round(ga_result@solution[1])

# Refit k-means at the GA-selected number of clusters.
optimal_kmeans <- kmeans(results, centers = best_k, nstart = 100)

# Visualize the GA-selected clustering in PCA space.
fviz_cluster(optimal_kmeans, data = results)+
        ggtitle("GACluster for result of pca") +
    theme_minimal()

## Attach GA-cluster membership to the feature data.
df1$group_GA <-optimal_kmeans$cluster

# Split by cluster.
# NOTE(review): these five subsets assume best_k >= 5; with fewer
# clusters some of them are empty. (The summary output below shows the
# GA actually selected 10 clusters, so g6..g10 are never materialized.)
g1_GA<- df1[df1$group_GA==1,]
g2_GA<- df1[df1$group_GA==2,]
g3_GA<- df1[df1$group_GA==3,]
g4_GA<- df1[df1$group_GA==4,]
g5_GA<- df1[df1$group_GA==5,]

1.5.2.1 Summary table for each GA cluster

# Per-GA-cluster summary of key WOE features (count, min / max / mean).
# The trailing comma after the last summarise() argument was removed.
cluster_summary <- df1 %>%
  group_by(group_GA) %>%
  summarise(
    count = n(),
    loan_amnt_min = min(loan_amnt_woe),
    loan_amnt_max = max(loan_amnt_woe),
    loan_amnt_mean = mean(loan_amnt_woe),
    funded_amnt_min = min(funded_amnt_woe),
    funded_amnt_max = max(funded_amnt_woe),
    funded_amnt_mean = mean(funded_amnt_woe),
    sub_grade_num_min = min(sub_grade_num_woe),
    sub_grade_num_max = max(sub_grade_num_woe),
    emp_length_num_min = min(emp_length_num_woe),
    emp_length_num_max = max(emp_length_num_woe),
    dti_min = min(dti_woe),
    dti_max = max(dti_woe),
    dti_mean = mean(dti_woe),
    revol_util_min = min(revol_util_woe),
    revol_util_max = max(revol_util_woe),
    revol_util_mean = mean(revol_util_woe)
  )

# Transpose so clusters run across the columns for compact printing.
cluster_summary <- t(cluster_summary)
print(cluster_summary)
#>                             [,1]          [,2]          [,3]          [,4]
#> group_GA            1.000000e+00    2.00000000    3.00000000    4.00000000
#> count               2.754000e+03 4114.00000000 3930.00000000 3020.00000000
#> loan_amnt_min      -1.783359e-01   -0.17833591   -0.17833591   -0.17833591
#> loan_amnt_max       2.374314e-01    0.23743139    0.23743139    0.23743139
#> loan_amnt_mean      2.808564e-02   -0.05250190   -0.05468618   -0.07178379
#> funded_amnt_min    -1.741437e-01   -0.17414371   -0.17414371   -0.17414371
#> funded_amnt_max     2.367068e-01    0.23670680    0.23670680    0.23670680
#> funded_amnt_mean    2.642411e-02   -0.05151174   -0.05440339   -0.07256304
#> sub_grade_num_min  -4.653398e-02   -0.04653398   -0.04653398   -0.04653398
#> sub_grade_num_max   2.524326e-02    0.02524326    0.02524326    0.02524326
#> emp_length_num_min -2.886748e-02   -0.02886748   -0.02886748   -0.02886748
#> emp_length_num_max  1.619973e-01    0.16199733    0.16199733    0.16199733
#> dti_min            -3.666917e-01   -0.36669167   -0.36669167   -0.36669167
#> dti_max             5.429565e-01    0.54295647    0.54295647    0.54295647
#> dti_mean           -7.408257e-03   -0.12762850   -0.14090143   -0.01885170
#> revol_util_min     -5.472870e-01   -0.54728702   -0.54728702   -0.54728702
#> revol_util_max      4.181434e-01    0.06930215    0.06930215    0.41814344
#> revol_util_mean     1.352461e-01   -0.29891951   -0.33697117   -0.02194559
#>                             [,5]          [,6]          [,7]          [,8]
#> group_GA              5.00000000    6.00000000    7.00000000    8.00000000
#> count              5555.00000000 1314.00000000 3759.00000000 7801.00000000
#> loan_amnt_min        -0.17833591   -0.17833591   -0.17833591   -0.17833591
#> loan_amnt_max         0.23743139    0.23743139    0.23743139    0.23743139
#> loan_amnt_mean        0.08406479    0.09077796    0.13944395   -0.04227538
#> funded_amnt_min      -0.17414371   -0.17414371   -0.17414371   -0.17414371
#> funded_amnt_max       0.23670680    0.23670680    0.23670680    0.23670680
#> funded_amnt_mean      0.08422553    0.09081283    0.13933902   -0.04113021
#> sub_grade_num_min    -0.04653398   -0.04653398   -0.04653398   -0.04653398
#> sub_grade_num_max     0.02524326    0.02524326    0.02524326    0.02524326
#> emp_length_num_min   -0.02886748   -0.02886748   -0.02886748   -0.02886748
#> emp_length_num_max    0.16199733    0.16199733    0.16199733    0.16199733
#> dti_min              -0.36669167   -0.36669167   -0.36669167   -0.36669167
#> dti_max               0.54295647    0.54295647    0.54295647    0.54295647
#> dti_mean              0.06808741   -0.01370626    0.09041298    0.02192296
#> revol_util_min       -0.54728702   -0.54728702   -0.54728702   -0.18215209
#> revol_util_max        0.41814344    0.41814344    0.41814344    0.41814344
#> revol_util_mean       0.02208711   -0.23495464    0.11881948    0.20019597
#>                             [,9]         [,10]
#> group_GA              9.00000000   10.00000000
#> count              2975.00000000 5647.00000000
#> loan_amnt_min        -0.17833591   -0.17833591
#> loan_amnt_max         0.23743139    0.23743139
#> loan_amnt_mean       -0.09045170   -0.05363854
#> funded_amnt_min      -0.17414371   -0.17414371
#> funded_amnt_max       0.23670680    0.23670680
#> funded_amnt_mean     -0.09113556   -0.05291467
#> sub_grade_num_min    -0.04653398   -0.04653398
#> sub_grade_num_max     0.02524326    0.02524326
#> emp_length_num_min   -0.02886748   -0.02886748
#> emp_length_num_max    0.16199733    0.16199733
#> dti_min              -0.36669167   -0.36669167
#> dti_max               0.54295647    0.54295647
#> dti_mean             -0.18348016    0.01412957
#> revol_util_min       -0.54728702   -0.18215209
#> revol_util_max       -0.18215209    0.41814344
#> revol_util_mean      -0.50297989    0.14052850
cluster_summary_df <- as.data.frame(cluster_summary)

### Cross-tabulation tables by GA cluster
# grade_woe level counts per cluster, spread to wide format (0 where a
# level does not occur in a cluster).
grade_summary <- df1 %>%
  count(group_GA, grade_woe, name = "count") %>%
  pivot_wider(names_from = grade_woe, values_from = count, values_fill = list(count = 0))
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
Counts of ‘grade_woe’ by Group
group_GA 0.860425570044054 0.103685264830283 0.421220703081032 -0.319075617367842 -1.12236611095412
1 2754 0 0 0 0
2 166 2561 1387 0 0
3 0 0 0 3930 0
4 0 0 0 0 3020
5 0 2762 0 2793 0
6 0 0 0 0 1314
7 1787 0 1972 0 0
8 0 4746 3055 0 0
9 0 0 0 0 2975
10 0 0 0 5647 0
# short_emp_woe level counts per cluster, spread to wide format.
short_emp_summary <- df1 %>%
  count(group_GA, short_emp_woe, name = "count") %>%
  pivot_wider(names_from = short_emp_woe, values_from = count, values_fill = list(count = 0))
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
Counts of ‘short_emp_woe’ by Group
group_GA -0.0244209293279626 0.161997334092502
1 2466 288
2 3571 543
3 3411 519
4 2648 372
5 4835 720
6 1103 211
7 3269 490
8 6910 891
9 2543 432
10 5017 630
# purpose_woe level counts per cluster, spread to wide format.
purpose_summary <- df1 %>%
  count(group_GA, purpose_woe, name = "count") %>%
  pivot_wider(names_from = purpose_woe, values_from = count, values_fill = list(count = 0))
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
Counts of ‘purpose_woe’ by Group
group_GA -0.196831871058625 -0.196608100580901 -0.160680606700238 0.0273586138416367 0.315718834677194
1 305 184 182 1505 578
2 493 309 409 2025 878
3 710 318 400 1923 579
4 871 181 175 1521 272
5 1246 112 252 3622 323
6 297 61 81 753 122
7 592 98 156 2532 381
8 1607 405 404 4486 899
9 518 389 427 1153 488
10 1489 213 296 3204 445
#delinq_2yrs
# Cross-tab of GA cluster vs WOE-encoded delinq_2yrs (two levels); counts per
# cluster, zeros filled for absent combinations.
delinq_2yrs_summary <- df1 %>%
  group_by(group_GA, delinq_2yrs_woe) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = delinq_2yrs_woe, values_from = count, values_fill = list(count = 0))
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
Counts of ‘delinq_2yrs_woe’ by Group
group_GA -0.0108046746319754 0.0639753139202767
1 2176 578
2 3132 982
3 3325 605
4 2831 189
5 4981 574
6 1253 61
7 3117 642
8 6550 1251
9 2776 199
10 4938 709
# Re-display of the grade_woe cross-tab (identical to the table rendered above).
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
Counts of ‘grade_woe’ by Group
group_GA 0.860425570044054 0.103685264830283 0.421220703081032 -0.319075617367842 -1.12236611095412
1 2754 0 0 0 0
2 166 2561 1387 0 0
3 0 0 0 3930 0
4 0 0 0 0 3020
5 0 2762 0 2793 0
6 0 0 0 0 1314
7 1787 0 1972 0 0
8 0 4746 3055 0 0
9 0 0 0 0 2975
10 0 0 0 5647 0
# Re-display of the short_emp_woe cross-tab (identical to the table rendered above).
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
Counts of ‘short_emp_woe’ by Group
group_GA -0.0244209293279626 0.161997334092502
1 2466 288
2 3571 543
3 3411 519
4 2648 372
5 4835 720
6 1103 211
7 3269 490
8 6910 891
9 2543 432
10 5017 630
# Re-display of the purpose_woe cross-tab (identical to the table rendered above).
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
Counts of ‘purpose_woe’ by Group
group_GA -0.196831871058625 -0.196608100580901 -0.160680606700238 0.0273586138416367 0.315718834677194
1 305 184 182 1505 578
2 493 309 409 2025 878
3 710 318 400 1923 579
4 871 181 175 1521 272
5 1246 112 252 3622 323
6 297 61 81 753 122
7 592 98 156 2532 381
8 1607 405 404 4486 899
9 518 389 427 1153 488
10 1489 213 296 3204 445
# Re-display of the delinq_2yrs_woe cross-tab (identical to the table rendered above).
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
Counts of ‘delinq_2yrs_woe’ by Group
group_GA -0.0108046746319754 0.0639753139202767
1 2176 578
2 3132 982
3 3325 605
4 2831 189
5 4981 574
6 1253 61
7 3117 642
8 6550 1251
9 2776 199
10 4938 709

1.5.3 Hierarchical

# Agglomerative hierarchical clustering (bottom-up)
# Dissimilarity matrix
# `results` presumably holds the PCA scores used for the other clustering
# methods (see the dbscan plot title below) — confirm upstream.
df3 <- data.frame(results)
set.seed(9)
# Work on a 100-row random sample so the dendrograms stay legible.
df4 <- df3 %>%
  sample_n(100)
d <- dist(df4, method = "euclidean")

# Hierarchical clustering using Ward and Complete Linkage
hc1 <- hclust(d, method = "ward.D" )
hc2 <- hclust(d, method = "complete" )

# Elbow (within-cluster sum of squares) diagnostic for choosing k.
fviz_nbclust(df4, FUN = hcut, method = "wss")

# Gap statistic over k = 1..10 with 50 bootstrap references.
gap_stat <- clusGap(df4, FUN = hcut, nstart = 50, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

# Plot the obtained dendrogram (hang = -1 aligns all labels at the bottom)
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 6, border = 2:5)

plot(hc2, cex = 0.6, hang = -1)
rect.hclust(hc2, k = 6, border = 2:5)

# Cut both trees into 6 groups (k = 6, matching the rectangles drawn above)
sub_grp1 <- cutree(hc1, k = 6)
sub_grp2 <- cutree(hc2, k = 6)

# Number of members in each cluster
table(sub_grp1)
#> sub_grp1
#>  1  2  3  4  5  6 
#> 19  7 28  9 18 19
table(sub_grp2)
#> sub_grp2
#>  1  2  3  4  5  6 
#> 34  7 24 10 17  8
fviz_cluster(list(data = df4, cluster = sub_grp1))+
        ggtitle("HCluster for Ward.D") +
    theme_minimal()

fviz_cluster(list(data = df4, cluster = sub_grp2))+
        ggtitle("HCluster for Complete") +
    theme_minimal()

# Create two dendrograms to compare the linkage methods side by side
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)

tanglegram(dend1, dend2)

dend_list <- dendlist(dend1, dend2)

# Entanglement close to 0 means the two trees order leaves similarly.
tanglegram(dend1, dend2,
           highlight_distinct_edges = FALSE, 
           common_subtrees_color_lines = FALSE, 
           common_subtrees_color_branches = TRUE, 
           main = paste("entanglement =", round(entanglement(dend_list), 2))
)

# divisive (top-down, DIANA)
# compute divisive hierarchical clustering
hc3 <- diana(df4)

# Divisive coefficient; amount of clustering structure found (closer to 1 = stronger)
hc3$dc
#> [1] 0.9243702
# plot dendrogram
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of diana")

1.5.4 dbscan

set.seed(10)
# Perform DBSCAN clustering on the full `results` matrix.
# NOTE(review): eps = 0.5 / minPts = 5 look like defaults — consider a kNN
# distance plot to justify eps for this data.
dbscan_result <- dbscan(results, eps = 0.5, minPts = 5)

# Add cluster assignments to the original data
# (in DBSCAN, cluster id 0 conventionally marks noise points)
df1$groupdb <- as.factor(dbscan_result$cluster)

# Visualize the clusters
fviz_cluster(dbscan_result, data = results)+
            ggtitle("dbscanCluster for result of pca") +
    theme_minimal()

1.6 part6_Compare clusters

# Davies-Bouldin index for k-means clusters (lower = better-separated clusters)
davies_bouldin_kmeans <- index.DB(results, km_res$cluster, centrotypes="centroids")

# Davies-Bouldin for DBSCAN
davies_bouldin_dbscan <- index.DB(results, dbscan_result$cluster, centrotypes="centroids")

# Davies-Bouldin for GA-initialised k-means
davies_bouldin_GA <- index.DB(results, optimal_kmeans$cluster, centrotypes="centroids")

# Davies-Bouldin Index values
davies_bouldin_kmeans_score <- davies_bouldin_kmeans$DB
davies_bouldin_dbscan_score <- davies_bouldin_dbscan$DB
davies_bouldin_GA_score <- davies_bouldin_GA$DB

# Create a data frame to store the results
comparison_table <- data.frame(
  Method = c(  "kmeans" , "DBSCAN" , "GA"),
  Davies_Bouldin_Index = c(davies_bouldin_kmeans_score, davies_bouldin_dbscan_score, davies_bouldin_GA_score)
)

# Display the table
knitr::kable(comparison_table, caption = "Comparison of Clustering Methods")
Comparison of Clustering Methods
Method Davies_Bouldin_Index
kmeans 1.2668721
DBSCAN 0.8394707
GA 1.0132065

1.7 part7_prediction

1.7.1 logit

# Reproducible 70/30 split of df_train: draw a label in {1, 2} per row with
# probabilities 0.7 / 0.3, then partition on that label.
set.seed(0)
index <- sample(2, size = nrow(df_train), replace = TRUE, prob = c(0.7, 0.3))
train <- df_train[index == 1, ]
test <- df_train[index == 2, ]
# Fit the full logistic regression on every WOE predictor and show the
# coefficient table.
logit1 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train)
summary(logit1)
#> 
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"), 
#>     data = train)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -1.4443  -0.6890  -0.5277  -0.3346   2.5906  
#> 
#> Coefficients: (1 not defined because of singularities)
#>                             Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)                 -1.44999    0.01837 -78.929  < 2e-16 ***
#> loan_amnt_woe                1.94050    0.74046   2.621 0.008776 ** 
#> funded_amnt_woe             -2.25716    0.75274  -2.999 0.002712 ** 
#> grade_woe                    0.85637    0.03408  25.131  < 2e-16 ***
#> sub_grade_num_woe            3.22561    0.61119   5.278 1.31e-07 ***
#> short_emp_woe               -0.13674    1.13185  -0.121 0.903843    
#> emp_length_num_woe           1.10872    1.10840   1.000 0.317167    
#> home_ownership_woe           0.43130    0.12808   3.368 0.000758 ***
#> dti_woe                      0.56028    0.06416   8.733  < 2e-16 ***
#> purpose_woe                  0.73032    0.10246   7.128 1.02e-12 ***
#> payment_inc_ratio_woe        0.77801    0.04835  16.090  < 2e-16 ***
#> delinq_2yrs_woe            132.46073 1301.84444   0.102 0.918956    
#> delinq_2yrs_zero_woe      -135.01281 1328.62828  -0.102 0.919060    
#> inq_last_6mths_woe           0.69061    0.08585   8.045 8.65e-16 ***
#> last_delinq_none_woe        -3.16821    1.72839  -1.833 0.066796 .  
#> last_major_derog_none_woe    3.83650    1.71383   2.239 0.025185 *  
#> open_acc_woe                 0.95787    0.35473   2.700 0.006929 ** 
#> pub_rec_woe                  6.16589    2.47378   2.493 0.012685 *  
#> pub_rec_zero_woe                  NA         NA      NA       NA    
#> revol_util_woe               0.33192    0.06034   5.501 3.78e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 27928  on 28621  degrees of freedom
#> Residual deviance: 25795  on 28603  degrees of freedom
#> AIC: 25833
#> 
#> Number of Fisher Scoring iterations: 10
# Predictions on training and test data
# NOTE(review): the "train" predictions are computed on ALL of df_train (both
# split halves), not on `train` only — the cutoff and AUC below therefore mix
# train and test rows; confirm this is intended.
train_pred1 <- predict(logit1, df_train, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test_pred1 <- predict(logit1, test, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test$test_pred1 <- test_pred1
df_train$train_pred1 <- train_pred1

# Optimal cutoff determination
# ROCR: evaluate accuracy at every candidate threshold and keep the maximiser.
pred1 <- prediction(train_pred1, df_train$bad_loans)
perform1 <- performance(pred1, "acc")
max1 <- which.max(slot(perform1, "y.values")[[1]])
prob1 <- slot(perform1, "x.values")[[1]][max1]
prob1
#>      6127 
#> 0.4876197
# AUC calculation for training data
auc_bin1 <- performance(pred1, "auc")
auc_bin1 <- unlist(slot(auc_bin1, "y.values"))
auc_bin1
#> [1] 0.6906972
#> [1] 0.6906972
# Confusion Matrix - Training Data
# Classify with the accuracy-optimal cutoff prob1 found above.
train_pred_class <- ifelse(train_pred1 > prob1, 1, 0)
tble_cf <- table(Predicted = train_pred_class, Actual = df_train$bad_loans)
tble_cf
#>          Actual
#> Predicted     0     1
#>         0 32805  7531
#>         1   253   280
# Classification Table - Training Data
# Extract the four cells by dimname ("1" = predicted/actual bad loan).
TP <- tble_cf["1", "1"]
FP <- tble_cf["1", "0"]
FN <- tble_cf["0", "1"]
TN <- tble_cf["0", "0"]

Specificity <- TN / (TN + FP)
Sensitivity <- TP / (TP + FN)
Precision <- TP / (TP + FP)
classification_metrics_train <- data.frame(Specificity, Sensitivity, Precision)
classification_metrics_train
#>   Specificity Sensitivity Precision
#> 1   0.9923468  0.03584688 0.5253283
# Misclassification Error - Training Data
# 1 - accuracy; note the very low sensitivity above — almost all bad loans
# are missed at this cutoff because it maximises raw accuracy.
misclassification_error_train <- 1 - sum(diag(tble_cf)) / sum(tble_cf)
misclassification_error_train
#> [1] 0.1904622
# ROC and AUC - Training Data (pROC)
roc_train <- roc(df_train$bad_loans, train_pred1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_test <- roc(test$bad_loans, test_pred1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Plot ROC Curve
ggroc(roc_train) +
  ggtitle("ROC Curve for Training Data") +
  xlab("1 - Specificity") +
  ylab("Sensitivity")

ggroc(roc_test) +
  ggtitle("ROC Curve for Test Data") +
  xlab("1 - Specificity") +
  ylab("Sensitivity")

# AUC Calculation (train vs test are close, so little overfitting here)
auc_train <- auc(roc_train)
auc_test <- auc(roc_test)
auc_train
#> Area under the curve: 0.6907
auc_test
#> Area under the curve: 0.6836

1.7.2 logit prediction on each groupkm

1.7.2.1 Group1

# Drop the prediction column added earlier, then attach k-means cluster ids
# and carve out cluster 1 as its own modelling segment.
df_train <- dplyr::select(df_train, -train_pred1)
df_train$groupkm <- km_res$cluster
segment1<- df_train[df_train$groupkm==1,]
# NOTE(review): positional slice — presumably keeps the WOE predictors plus
# bad_loans, but the model summary below shows a `groupkm` row (NA), so the
# cluster id apparently survives this slice; verify 1:21 selects what is intended.
segment1 <- segment1[,1:21]

# Reproducible 70/30 split within the segment.
set.seed(1111)
index1 <- sample(2,size = nrow(segment1), replace = T , prob=c(0.7,0.3))
train1 <- segment1[index1 == 1, ]
test1 <- segment1[index1 == 2, ]

# Segment-specific logistic regression.
logit_Seg1 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train1)
summary(logit_Seg1)
#> 
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"), 
#>     data = train1)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -0.8684  -0.4241  -0.3519  -0.2834   2.7032  
#> 
#> Coefficients: (3 not defined because of singularities)
#>                            Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)                 -2.2293     0.1070 -20.841  < 2e-16 ***
#> loan_amnt_woe               -4.5146     3.8721  -1.166  0.24364    
#> funded_amnt_woe              3.3370     3.9311   0.849  0.39595    
#> grade_woe                        NA         NA      NA       NA    
#> sub_grade_num_woe            5.2655     2.7570   1.910  0.05615 .  
#> short_emp_woe               -1.4831     3.8473  -0.385  0.69988    
#> emp_length_num_woe           1.7646     3.7840   0.466  0.64098    
#> home_ownership_woe           0.4044     0.4517   0.895  0.37062    
#> dti_woe                      0.4234     0.2421   1.748  0.08038 .  
#> purpose_woe                  1.1245     0.3354   3.353  0.00080 ***
#> payment_inc_ratio_woe        1.0673     0.1773   6.020 1.75e-09 ***
#> delinq_2yrs_woe            143.0207  4342.6574   0.033  0.97373    
#> delinq_2yrs_zero_woe      -143.0564  4432.0016  -0.032  0.97425    
#> inq_last_6mths_woe           0.8947     0.3281   2.727  0.00639 ** 
#> last_delinq_none_woe         8.3971     6.8792   1.221  0.22222    
#> last_major_derog_none_woe    3.6373     9.8150   0.371  0.71095    
#> open_acc_woe                 0.5689     1.2771   0.445  0.65598    
#> pub_rec_woe                 19.5928    20.9738   0.934  0.35022    
#> pub_rec_zero_woe                 NA         NA      NA       NA    
#> revol_util_woe               0.8619     0.2135   4.038 5.39e-05 ***
#> groupkm                          NA         NA      NA       NA    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 2635.8  on 5047  degrees of freedom
#> Residual deviance: 2531.2  on 5030  degrees of freedom
#> AIC: 2567.2
#> 
#> Number of Fisher Scoring iterations: 11
# Predictions on training and test data
trainpred_seg1 <- predict(logit_Seg1, train1, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
testpred_seg1 <- predict(logit_Seg1, test1, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test1$testpred_seg1 <- testpred_seg1
train1$trainpred_seg1 <- trainpred_seg1

# Optimal cutoff determination
# NOTE(review): `max1` overwrites the variable of the same name used for the
# global model above — rename to avoid accidental reuse.
pred1_seg <- prediction(trainpred_seg1, train1$bad_loans)
performseg1 <- performance(pred1_seg, "acc")
max1 <- which.max(slot(performseg1, "y.values")[[1]])
probseg1 <- slot(performseg1, "x.values")[[1]][max1]
probseg1
#>     
#> Inf
# probseg1 is Inf: accuracy is maximised by predicting the majority class (0)
# for every loan in this segment — the "optimal" cutoff is degenerate here.
# AUC calculation for training data
auc_binseg1 <- performance(pred1_seg, "auc")
auc_binseg1 <- unlist(slot(auc_binseg1, "y.values"))
auc_binseg1
#> [1] 0.6562001
#> [1] 0.6562001
# Confusion Matrix - Training Data
# Bug fix: classify with the segment-specific cutoff probseg1 computed above,
# not the global model's cutoff prob1 (which belongs to logit1, not logit_Seg1).
# NOTE: probseg1 is Inf here, so every loan is classified 0 — the table below
# is identical to the one the old (wrong) cutoff produced.
trainseg_pred_class <- ifelse(trainpred_seg1 > probseg1, 1, 0)
tble_trainseg <- table(Predicted = trainseg_pred_class, Actual = train1$bad_loans)
tble_trainseg
#>          Actual
#> Predicted    0    1
#>         0 4680  368
# ROC and AUC - Training Data (segment 1)
roc_trainseg <- roc(train1$bad_loans, trainpred_seg1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_testseg <- roc(test1$bad_loans, testpred_seg1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Plot ROC Curve
ggroc(roc_trainseg) +
  ggtitle("ROC Curve for Trainingseg Data") +
  xlab("1 - Specificityseg") +
  ylab("Sensitivityseg")

ggroc(roc_testseg) +
  ggtitle("ROC Curve for Testseg Data") +
  xlab("1 - Specificityseg") +
  ylab("Sensitivityseg")

# AUC Calculation
auc_trainseg <- auc(roc_trainseg)
auc_testseg <- auc(roc_testseg)
auc_trainseg
#> Area under the curve: 0.6562
auc_testseg
#> Area under the curve: 0.6175
# Distribution of predicted default probabilities for segment 1.
hist(trainpred_seg1)

1.7.2.2 Group2

# Segment 2: rows assigned to k-means cluster 2, same positional column slice
# as segment 1 (see the NOTE there about what 1:21 keeps).
segment2<- df_train[df_train$groupkm==2,]
segment2 <- segment2[,1:21]

# Reproducible 70/30 split within the segment.
set.seed(2222)
index2 <- sample(2,size = nrow(segment2), replace = T , prob=c(0.7,0.3))
train2 <- segment2[index2 == 1, ]
test2 <- segment2[index2 == 2, ]

# Segment-specific logistic regression.
logit_Seg2 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train2)
summary(logit_Seg2)
#> 
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"), 
#>     data = train2)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -1.0589  -0.6174  -0.5347  -0.4456   2.4432  
#> 
#> Coefficients: (2 not defined because of singularities)
#>                             Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)                 -1.48995    0.03425 -43.499  < 2e-16 ***
#> loan_amnt_woe                2.65206    1.06644   2.487 0.012888 *  
#> funded_amnt_woe             -3.03155    1.08883  -2.784 0.005366 ** 
#> grade_woe                    0.74485    0.11718   6.356 2.07e-10 ***
#> sub_grade_num_woe            4.11972    0.96288   4.279 1.88e-05 ***
#> short_emp_woe                0.89722    1.69348   0.530 0.596247    
#> emp_length_num_woe           0.45522    1.66172   0.274 0.784127    
#> home_ownership_woe           0.44072    0.19420   2.269 0.023244 *  
#> dti_woe                      0.65359    0.10061   6.496 8.23e-11 ***
#> purpose_woe                  0.56190    0.15509   3.623 0.000291 ***
#> payment_inc_ratio_woe        0.71880    0.08655   8.305  < 2e-16 ***
#> delinq_2yrs_woe            151.66043 2160.39968   0.070 0.944034    
#> delinq_2yrs_zero_woe      -153.42792 2204.84728  -0.070 0.944523    
#> inq_last_6mths_woe           0.67703    0.13302   5.090 3.59e-07 ***
#> last_delinq_none_woe        -6.81050    2.62895  -2.591 0.009582 ** 
#> last_major_derog_none_woe    5.07937    2.63502   1.928 0.053900 .  
#> open_acc_woe                 1.44641    0.53928   2.682 0.007316 ** 
#> pub_rec_woe                  5.33259    3.64037   1.465 0.142962    
#> pub_rec_zero_woe                  NA         NA      NA       NA    
#> revol_util_woe               0.28205    0.08855   3.185 0.001447 ** 
#> groupkm                           NA         NA      NA       NA    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 11953  on 13825  degrees of freedom
#> Residual deviance: 11662  on 13807  degrees of freedom
#> AIC: 11700
#> 
#> Number of Fisher Scoring iterations: 11
# Predictions on training and test data
trainpred_seg2 <- predict(logit_Seg2, train2, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
testpred_seg2 <- predict(logit_Seg2, test2, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test2$testpred_seg2 <- testpred_seg2
train2$trainpred_seg2 <- trainpred_seg2


# Optimal cutoff determination (accuracy-maximising threshold for this segment)
pred2_seg <- prediction(trainpred_seg2, train2$bad_loans)
performseg2 <- performance(pred2_seg, "acc")
max2 <- which.max(slot(performseg2, "y.values")[[1]])
probseg2 <- slot(performseg2, "x.values")[[1]][max2]
probseg2
#>      5979 
#> 0.3734171
# AUC calculation for training data
auc_binseg2 <- performance(pred2_seg, "auc")
auc_binseg2 <- unlist(slot(auc_binseg2, "y.values"))
auc_binseg2
#> [1] 0.6109219
#> [1] 0.6109219
# Confusion Matrix - Training Data
# Bug fix: the segment's own optimal cutoff probseg2 (computed above) was never
# used — the global cutoff prob1 from logit1 was applied instead. Classify with
# the segment-specific threshold.
trainseg2_pred_class <- ifelse(trainpred_seg2 > probseg2, 1, 0)
tble_trainseg2 <- table(Predicted = trainseg2_pred_class, Actual = train2$bad_loans)
tble_trainseg2
# (the previously rendered table was produced with the old prob1 cutoff and is
# stale; re-knit to refresh — with probseg2 some loans may now be predicted 1)
# ROC and AUC - Training Data (segment 2)
roc_trainseg2 <- roc(train2$bad_loans, trainpred_seg2)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_testseg2 <- roc(test2$bad_loans, testpred_seg2)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# AUC Calculation
auc_trainseg2 <- auc(roc_trainseg2)
auc_testseg2 <- auc(roc_testseg2)
auc_trainseg2
#> Area under the curve: 0.6109
auc_testseg2
#> Area under the curve: 0.6038
# Distribution of predicted default probabilities for segment 2.
hist(trainpred_seg2)

1.7.2.3 Group3

# Segment 3: rows assigned to k-means cluster 3, same positional column slice
# as segment 1 (see the NOTE there about what 1:21 keeps).
segment3<- df_train[df_train$groupkm==3,]
segment3 <- segment3[,1:21]
# Reproducible 70/30 split within the segment.
set.seed(3333)
index3 <- sample(2,size = nrow(segment3), replace = T , prob=c(0.7,0.3))
train3 <- segment3[index3 == 1, ]
test3 <- segment3[index3 == 2, ]

# Segment-specific logistic regression.
logit_Seg3 <- glm(formula = bad_loans ~ ., family = stats::binomial("logit"), data = train3)
summary(logit_Seg3)
#> 
#> Call:
#> glm(formula = bad_loans ~ ., family = stats::binomial("logit"), 
#>     data = train3)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -1.3902  -0.8672  -0.7407   1.3266   2.0066  
#> 
#> Coefficients: (3 not defined because of singularities)
#>                           Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)               -1.40018    0.05227 -26.790  < 2e-16 ***
#> loan_amnt_woe              1.77443    1.11187   1.596  0.11051    
#> funded_amnt_woe           -2.10345    1.12636  -1.867  0.06184 .  
#> grade_woe                  0.76927    0.08293   9.276  < 2e-16 ***
#> sub_grade_num_woe          2.56629    0.84299   3.044  0.00233 ** 
#> short_emp_woe             -0.80541    1.65932  -0.485  0.62740    
#> emp_length_num_woe         1.41109    1.62053   0.871  0.38388    
#> home_ownership_woe         0.51826    0.18548   2.794  0.00520 ** 
#> dti_woe                    0.43239    0.08990   4.810 1.51e-06 ***
#> purpose_woe                0.72266    0.15089   4.789 1.67e-06 ***
#> payment_inc_ratio_woe      0.80587    0.06940  11.613  < 2e-16 ***
#> delinq_2yrs_woe            0.57485    0.92722   0.620  0.53528    
#> delinq_2yrs_zero_woe            NA         NA      NA       NA    
#> inq_last_6mths_woe         0.62154    0.12073   5.148 2.63e-07 ***
#> last_delinq_none_woe      -6.28019    2.47936  -2.533  0.01131 *  
#> last_major_derog_none_woe -0.22212    2.32830  -0.095  0.92400    
#> open_acc_woe               0.57507    0.50910   1.130  0.25865    
#> pub_rec_woe                2.82908    3.42105   0.827  0.40826    
#> pub_rec_zero_woe                NA         NA      NA       NA    
#> revol_util_woe             0.29099    0.09412   3.092  0.00199 ** 
#> groupkm                         NA         NA      NA       NA    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 11828  on 9635  degrees of freedom
#> Residual deviance: 11502  on 9618  degrees of freedom
#> AIC: 11538
#> 
#> Number of Fisher Scoring iterations: 4
# Predictions on training and test data
trainpred_seg3 <- predict(logit_Seg3, train3, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
testpred_seg3 <- predict(logit_Seg3, test3, type = 'response')
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
test3$testpred_seg3 <- testpred_seg3
train3$trainpred_seg3 <- trainpred_seg3


# Optimal cutoff determination (accuracy-maximising threshold for this segment)
pred3_seg <- prediction(trainpred_seg3, train3$bad_loans)
performseg3 <- performance(pred3_seg, "acc")
max3 <- which.max(slot(performseg3, "y.values")[[1]])
probseg3 <- slot(performseg3, "x.values")[[1]][max3]
probseg3
#>      6143 
#> 0.4546298
# AUC calculation for training data
auc_binseg3 <- performance(pred3_seg, "auc")
auc_binseg3 <- unlist(slot(auc_binseg3, "y.values"))
auc_binseg3
#> [1] 0.6112667
# Confusion Matrix - Training Data
# Bug fix: the segment's own optimal cutoff probseg3 (computed above) was never
# used — the global cutoff prob1 from logit1 was applied instead. Classify with
# the segment-specific threshold.
trainseg3_pred_class <- ifelse(trainpred_seg3 > probseg3, 1, 0)
tble_trainseg3 <- table(Predicted = trainseg3_pred_class, Actual = train3$bad_loans)
tble_trainseg3
# (the previously rendered table was produced with the old prob1 cutoff and is
# stale; re-knit to refresh — the lower probseg3 threshold will flag more 1s)
# ROC and AUC - Training Data (segment 3)
roc_trainseg3 <- roc(train3$bad_loans, trainpred_seg3)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc_testseg3 <- roc(test3$bad_loans, testpred_seg3)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Plot ROC Curve
ggroc(roc_trainseg3) +
  ggtitle("ROC Curve for Trainingseg3 Data") +
  xlab("1 - Specificityseg3") +
  ylab("Sensitivityseg3")

ggroc(roc_testseg3) +
  ggtitle("ROC Curve for Testseg3 Data") +
  xlab("1 - Specificityseg3") +
  ylab("Sensitivityseg3")

# AUC Calculation
auc_trainseg3 <- auc(roc_trainseg3)
auc_testseg3 <- auc(roc_testseg3)
auc_trainseg3
#> Area under the curve: 0.6113
auc_testseg3
#> Area under the curve: 0.6179

1.7.2.4 compare results_logitmodel

# Segment sizes. nrow() counts rows directly; the original
# as.numeric(count(.)) round-tripped through a 1x1 tibble for the same value.
count_seg1 <- nrow(segment1)
count_seg2 <- nrow(segment2)
count_seg3 <- nrow(segment3)

# Create a data frame with the accuracy metrics for each group
# (train/test AUC per segment logit model, plus the segment size)
results_logitmodel <- data.frame(
  Group = c("Group 1", "Group 2", "Group 3"),
  AUC_Train = c(auc_trainseg, auc_trainseg2, auc_trainseg3),
  AUC_Test = c(auc_testseg, auc_testseg2, auc_testseg3),
  count = c(count_seg1, count_seg2, count_seg3)
)

# Print the results table
print(results_logitmodel)
#>     Group AUC_Train  AUC_Test count
#> 1 Group 1 0.6562001 0.6175226  7309
#> 2 Group 2 0.6109219 0.6037607 19836
#> 3 Group 3 0.6112667 0.6179009 13724

1.7.3 XG_boost

1.7.3.1 Group1

# Convert data to DMatrix format for XGBoost
# NOTE(review): `[, -1]` assumes bad_loans is column 1 — verify. Also train1
# and test1 were augmented above with trainpred_seg1/testpred_seg1, so the
# logit model's own predictions enter the feature matrix; that looks like
# unintended leakage and likely inflates the AUC below — confirm.
trainxg_matrix1 <- xgb.DMatrix(data = as.matrix(train1[, -1]), label = train1$bad_loans)
testxg_matrix1 <- xgb.DMatrix(data = as.matrix(test1[, -1]), label = test1$bad_loans)

# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")

# Train the model
xgb_model1 <- xgboost(params = params, data = trainxg_matrix1, nrounds = 100, verbose = 0)

# Make predictions (training data only — no held-out evaluation here)
train_pred_xgb1 <- predict(xgb_model1, trainxg_matrix1)

# Calculate AUC for train and test data
auc_train_xgb1 <- auc(train1$bad_loans, train_pred_xgb1)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Confusion Matrix and Metrics (fixed 0.5 probability cutoff)
train_pred_class_xgb1 <- ifelse(train_pred_xgb1 > 0.5, 1, 0)

# Training metrics
confusion_matrix_train_xgb1 <- table(Predicted = train_pred_class_xgb1, Actual = train1$bad_loans)
TP_train_xgb1 <- confusion_matrix_train_xgb1["1", "1"]
FP_train_xgb1 <- confusion_matrix_train_xgb1["1", "0"]
FN_train_xgb1 <- confusion_matrix_train_xgb1["0", "1"]
TN_train_xgb1 <- confusion_matrix_train_xgb1["0", "0"]

Specificity_train_xgb1 <- TN_train_xgb1 / (TN_train_xgb1 + FP_train_xgb1)
Sensitivity_train_xgb1 <- TP_train_xgb1 / (TP_train_xgb1 + FN_train_xgb1)
Precision_train_xgb1 <- TP_train_xgb1 / (TP_train_xgb1 + FP_train_xgb1)

1.7.3.2 Group2

# Convert data to DMatrix format for XGBoost
# NOTE(review): as for group 1 — `[, -1]` is positional, and train2/test2
# carry the logit predictions (trainpred_seg2/testpred_seg2) as features;
# confirm this leakage is intended.
trainxg_matrix2 <- xgb.DMatrix(data = as.matrix(train2[, -1]), label = train2$bad_loans)
testxg_matrix2 <- xgb.DMatrix(data = as.matrix(test2[, -1]), label = test2$bad_loans)

# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")

# Train the model
xgb_model2 <- xgboost(params = params, data = trainxg_matrix2, nrounds = 100, verbose = 0)

# Make predictions (training data only — no held-out evaluation here)
train_pred_xgb2 <- predict(xgb_model2, trainxg_matrix2)

# Calculate AUC for train and test data
auc_train_xgb2 <- auc(train2$bad_loans, train_pred_xgb2)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Confusion Matrix and Metrics (fixed 0.5 probability cutoff)
train_pred_class_xgb2 <- ifelse(train_pred_xgb2 > 0.5, 1, 0)

# Training metrics
confusion_matrix_train_xgb2 <- table(Predicted = train_pred_class_xgb2, Actual = train2$bad_loans)
TP_train_xgb2 <- confusion_matrix_train_xgb2["1", "1"]
FP_train_xgb2 <- confusion_matrix_train_xgb2["1", "0"]
FN_train_xgb2 <- confusion_matrix_train_xgb2["0", "1"]
TN_train_xgb2 <- confusion_matrix_train_xgb2["0", "0"]

Specificity_train_xgb2 <- TN_train_xgb2 / (TN_train_xgb2 + FP_train_xgb2)
Sensitivity_train_xgb2 <- TP_train_xgb2 / (TP_train_xgb2 + FN_train_xgb2)
Precision_train_xgb2 <- TP_train_xgb2 / (TP_train_xgb2 + FP_train_xgb2)

1.7.3.3 Group3

# Convert data to DMatrix format for XGBoost
# NOTE(review): as for group 1 — `[, -1]` is positional, and train3/test3
# carry the logit predictions (trainpred_seg3/testpred_seg3) as features;
# confirm this leakage is intended.
trainxg_matrix3 <- xgb.DMatrix(data = as.matrix(train3[, -1]), label = train3$bad_loans)
testxg_matrix3 <- xgb.DMatrix(data = as.matrix(test3[, -1]), label = test3$bad_loans)

# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")

# Train the model
xgb_model3 <- xgboost(params = params, data = trainxg_matrix3, nrounds = 100, verbose = 0)

# Make predictions (training data only — no held-out evaluation here)
train_pred_xgb3 <- predict(xgb_model3, trainxg_matrix3)

# Calculate AUC for train and test data
auc_train_xgb3 <- auc(train3$bad_loans, train_pred_xgb3)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# Confusion Matrix and Metrics (fixed 0.5 probability cutoff)
train_pred_class_xgb3 <- ifelse(train_pred_xgb3 > 0.5, 1, 0)

# Training metrics
confusion_matrix_train_xgb3 <- table(Predicted = train_pred_class_xgb3, Actual = train3$bad_loans)
TP_train_xgb3 <- confusion_matrix_train_xgb3["1", "1"]
FP_train_xgb3 <- confusion_matrix_train_xgb3["1", "0"]
FN_train_xgb3 <- confusion_matrix_train_xgb3["0", "1"]
TN_train_xgb3 <- confusion_matrix_train_xgb3["0", "0"]

Specificity_train_xgb3 <- TN_train_xgb3 / (TN_train_xgb3 + FP_train_xgb3)
Sensitivity_train_xgb3 <- TP_train_xgb3 / (TP_train_xgb3 + FN_train_xgb3)
Precision_train_xgb3 <- TP_train_xgb3 / (TP_train_xgb3 + FP_train_xgb3)

1.7.3.4 compare results_xgb

# Create a data frame with the accuracy metrics for each group
# NOTE(review): training-set AUC only — these near-perfect values say nothing
# about generalisation; compute test-set AUC before comparing with the logit.
results_xgb <- data.frame(
  Group = c("Group 1", "Group 2", "Group 3"),
  AUC_Train = c(auc_train_xgb1, auc_train_xgb2, auc_train_xgb3),
  count = c(count_seg1,count_seg2,count_seg3)
)

# Print the results table
print(results_xgb)
#>     Group AUC_Train count
#> 1 Group 1 0.9935793  7309
#> 2 Group 2 0.9105632 19836
#> 3 Group 3 0.9275514 13724

1.7.4 compare results of each model

# Combine the results from both models into one data frame.
# Count has length 3 and is recycled over the 6 rows — correct here because
# the same three segments appear once per model. Note the comparison is on
# training AUC only (no test AUC exists for the XGBoost models above).
combined_results_models <- data.frame(
  Model = rep(c("logit", "XGBoost"), each = 3),
  Group = rep(c("Group 1", "Group 2", "Group 3"), times = 2),
  AUC_Train = c(auc_trainseg, auc_trainseg2, auc_trainseg3,
                auc_train_xgb1, auc_train_xgb2, auc_train_xgb3),
  Count = c(count_seg1, count_seg2, count_seg3)
)

# Print the combined results table
print(combined_results_models)
#>     Model   Group AUC_Train Count
#> 1   logit Group 1 0.6562001  7309
#> 2   logit Group 2 0.6109219 19836
#> 3   logit Group 3 0.6112667 13724
#> 4 XGBoost Group 1 0.9935793  7309
#> 5 XGBoost Group 2 0.9105632 19836
#> 6 XGBoost Group 3 0.9275514 13724

1.8 part8_Ranking

# Coefficients of the global logit model, used below as ranking weights.
# NOTE(review): delinq_2yrs_woe / delinq_2yrs_zero_woe have huge offsetting
# coefficients (±~130, insignificant per the summary above) — near-collinear;
# pub_rec_zero_woe is NA (dropped for singularity). Don't use those as weights.
coefficients_logit <- coef(logit1)
print(coefficients_logit)
#>               (Intercept)             loan_amnt_woe           funded_amnt_woe 
#>                -1.4499876                 1.9404957                -2.2571601 
#>                 grade_woe         sub_grade_num_woe             short_emp_woe 
#>                 0.8563656                 3.2256056                -0.1367362 
#>        emp_length_num_woe        home_ownership_woe                   dti_woe 
#>                 1.1087247                 0.4312961                 0.5602752 
#>               purpose_woe     payment_inc_ratio_woe           delinq_2yrs_woe 
#>                 0.7303181                 0.7780070               132.4607330 
#>      delinq_2yrs_zero_woe        inq_last_6mths_woe      last_delinq_none_woe 
#>              -135.0128108                 0.6906149                -3.1682093 
#> last_major_derog_none_woe              open_acc_woe               pub_rec_woe 
#>                 3.8364974                 0.9578692                 6.1658912 
#>          pub_rec_zero_woe            revol_util_woe 
#>                        NA                 0.3319224
# Pull the logit coefficients used as composite-score weights.
weight_grade <- coefficients_logit['grade_woe']
weight_emp_length_num_woe <- coefficients_logit['emp_length_num_woe']
weight_loan_amnt_woe <- coefficients_logit['loan_amnt_woe']
# NOTE(review): extracted but never used in the composite score below.
weight_delinq_2yrs_zero_woe <- coefficients_logit['delinq_2yrs_zero_woe']

# Rank GA clusters by a coefficient-weighted average of their mean WOE values.
# Bug fix: the original passed `groups = 'drop'` inside summarise(), which
# silently created a literal character column named `groups` (it appeared in
# the printed tibble); the intended argument is `.groups = 'drop'`.
cluster_summary <- df1 %>%
  group_by(group_GA) %>%
  summarise(
    grade_woe = mean(grade_woe),
    emp_length_num_woe = mean(emp_length_num_woe),
    loan_amnt_woe = mean(loan_amnt_woe),
    .groups = 'drop'
  ) %>%
  mutate(
    composite_score = grade_woe * weight_grade +
      emp_length_num_woe * weight_emp_length_num_woe +
      loan_amnt_woe * weight_loan_amnt_woe
  ) %>%
  arrange(desc(composite_score))
print(cluster_summary)
# (output regenerated after the fix — same values, without the stray `groups`
# column; exact print widths may differ on re-knit)
#> # A tibble: 10 × 5
#>    group_GA grade_woe emp_length_num_woe loan_amnt_woe composite_score
#>       <int>     <dbl>              <dbl>         <dbl>           <dbl>
#>  1        7     0.630          -0.000535        0.139           0.810 
#>  2        1     0.860          -0.00511         0.0281          0.786 
#>  3        8     0.228          -0.00326        -0.0423          0.110 
#>  4        2     0.241           0.000562       -0.0525          0.105 
#>  5        5    -0.109          -0.000299        0.0841          0.0696
#>  6        3    -0.319           0.000533       -0.0547         -0.379 
#>  7       10    -0.319          -0.00410        -0.0536         -0.382 
#>  8        6    -1.12            0.00564         0.0908         -0.779 
#>  9        4    -1.12           -0.00178        -0.0718         -1.10  
#> 10        9    -1.12            0.00313        -0.0905         -1.13